import re,requests
from mysqlhelper import MysqlHelper
# Module-level database helper; jobinfo() calls its execute_insert_sql()
# to store each scraped row.
helper = MysqlHelper()
from w3lib.html import remove_tags
# https://search.51job.com/list/010000,000000,0000,32,9,99,%2B,2,1.html
# https://search.51job.com/list/010000,000000,0000,32,9,99,%2B,2,2.html

def joblist():
    """Crawl the 51job search-result pages and scrape every job they link to.

    Fetches list pages 214 through 1299, extracts every ``href`` value from
    each page, and passes the links that look like job-detail pages to
    ``jobinfo``.

    Raises:
        requests.RequestException: if a list page cannot be fetched
            (including when the request times out).
    """
    # Keep the template separate from the per-page URL.  Formatting the
    # template in place would consume the ``{}`` placeholder on the first
    # iteration, so every later iteration would re-fetch the first page.
    url_template = "https://search.51job.com/list/010000,000000,0000,32,9,99,%2B,2,{}.html"
    for page in range(214, 1300):
        print('执行了的页数', page)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url_template.format(page), timeout=30)
        for link in re.findall('href="(.*?)"', response.text):
            # Job-detail links end with this query string, e.g.
            # https://jobs.51job.com/beijing/95213826.html?s=01&t=0
            if 'html?s=01&t=0' in link:
                jobinfo(link)

def _parse_job_page(text):
    """Extract the job fields from the HTML of a 51job detail page.

    Args:
        text: decoded HTML of the detail page.

    Returns:
        A ``(title, price, weizhi, jingyan, xueli, shijian, info)`` tuple of
        strings, or ``None`` when any expected element is missing.
    """
    # Job title
    titles = re.findall('<h1 title="(.*)">', text)
    # Salary: taken from the second <strong>...</strong> match
    prices = re.findall('<strong>(.*)</strong>', text)
    # Location: first entry of the summary line's title attribute.  The
    # pattern stops at the first "&nbsp;&nbsp;" separator; a bare "|" here
    # would be regex alternation and could yield a match with no group.
    weizhi_match = re.search('class="msg ltype" title="(.*?)&nbsp;&nbsp;', text)
    # Summary entries between separators: [0] experience, [1] education
    entries = re.findall('</span>&nbsp;&nbsp;(.*?)&nbsp;&nbsp;<span>', text)
    # Publish time: the text immediately before "发布"
    shijian_match = re.search('&nbsp;&nbsp;(.*?)发布', text)
    # Job description
    descriptions = re.findall(
        '<div class="bmsg job_msg inbox">(.*)<div class="mt10">', text, re.S)

    page_is_complete = (
        titles
        and len(prices) >= 2
        and weizhi_match is not None
        and len(entries) >= 2
        and shijian_match is not None
        and descriptions
    )
    if not page_is_complete:
        return None

    # Only the last five characters are kept (presumably an MM-DD date --
    # verify against a live page).
    shijian = shijian_match.group(1)[-5:]
    # Strip the whole "&nbsp;" entity; removing only "&nbsp" would leave a
    # stray ";" in the stored description.
    info = re.sub('&nbsp;?', '', remove_tags(descriptions[0])).strip()
    return (titles[0], prices[1], weizhi_match.group(1),
            entries[0], entries[1], shijian, info)


def jobinfo(url):
    """Scrape one 51job job-detail page and insert it into ``jobtest``.

    The raw response is also written to ``qiancheng.html`` (overwritten on
    every call).  Pages that do not have the expected layout are reported
    and skipped instead of aborting the crawl.

    Args:
        url: address of the job-detail page.

    Raises:
        requests.RequestException: if the page cannot be fetched (including
            when the request times out).
    """
    print(url)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    # Dump of the most recently fetched page; the file is closed before
    # parsing so it is not held open during the database insert.
    with open('qiancheng.html', 'wb') as f:
        f.write(response.content)

    fields = _parse_job_page(response.text)
    if fields is None:
        print('skipped (unexpected page layout):', url)
        return

    # Source website label stored with every row
    wangzhan = '前程5U'
    insert_sql = 'INSERT INTO jobtest(wangzhan,title,price,weizhi,jingyan,xueli,shijian,info) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)'
    data = (wangzhan,) + fields
    helper.execute_insert_sql(insert_sql, data)



# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
    joblist()
